There are 11 trees that were built with iqtree from the T-shaped alignments. They constitute whole genome, 10% wgs-90% pol, 20% wgs-80% pol, and so on up until 100% pol. There are 1196 tips on these trees, which comprise all subtype B.
pol and the mixtures with clusters identified with pol, there is a dip to wgs60, and then congruence increases again. (Figures 3.2 and 3.1)We put the trees into a dataframe that has as columns: the percent of full genome (0 through 100), the partition (as produced by prop.part), the bootstrap value of that partition (NA if doesn’t exist in tree for percent of full genome, otherwise bootstrap value)
# inputs:
# bipart - bipartition with labels
# t - tree index
bootstrap_helper %<-% function(bipart, t) {
if(is.monophyletic(phylos[[t]],bipart)) {
mrca <- ggtree::MRCA(trees[[t]], bipart)
boot <- dplyr::filter(trees[[t]]@data, node==mrca)$UFboot
return(list(bipart, boot))
}
}
#bootstrap %<-% mclapply(labels, function(x) bootstrap_helper(x, 1))
bootstrap <- function(t) {
future_lapply(labels, function(x) bootstrap_helper(x, t))
}
all_tree_bootstraps <- lapply(1:length(trees), bootstrap)
saveRDS(all_tree_bootstraps, file=file.path(PATH,"all_tree_bootstraps_subtypeB.rds"))
# it looks like each element in each list in all_tree_bootstraps is the same partition which is nice. So we want to save the name of the partition, and the number if it's the second value, or NA if it doesn't exist/the value is NULL
# labels is the actual list of labels
# in order to do that we have to first create a list replacement so that each element of the list has 2 elements itself
replace_null <- function(x) {
# get number of NULL values in bootstrap list
num_null <- sapply(x, is.null) %>% sum()
# make list that we will use to replace, split forms them into pairs
list_replace <- rep(list(NA,NA), num_null)
list_replace <- split(list_replace, factor(c(1:num_null,1:num_null)))
x[sapply(x,is.null)] <- list_replace
return(x)
}
only_bootstraps <- lapply(all_tree_bootstraps, function(x)
sapply(replace_null(x), "[[", 2))
names(only_bootstraps) <- treenames
bootstrap_df <- data.frame(only_bootstraps)
bootstrap_df$labelind <- 1:nrow(bootstrap_df)
We keep the labels separate from the dataframe for now because the names are extremely messy to keep as rows.
As a quick sanity check, we compute the RF distance between all of our phylogenetic trees and create a sample-to-sample heatmap for them. It does appear that the pol tree is most similar to the pol10-wgs90 tree, and that the RF distance increases as the wgs proportion increases.
just_trees <- phylos[c(11,1,2,3,4,5,6,7,8,9,10)]
class(just_trees) <- "multiPhylo"
tree_rfdist <- dist.topo(just_trees)
ggplot(melt(as.matrix(tree_rfdist)), aes(X1, X2)) + geom_tile(aes(fill=value)) + coord_fixed() + scale_fill_viridis_c()
(#fig:rf_dist)Sample to sample RF distances between all phylogenetic trees. Closer to yellow is farther away, and closer to green is closer.
Idea was we are going to cluster the pol tree and annotate on that plot made of the bootstraps.
# spotchecking the results of this it does look ok but should still write a test tree and be careful...
poltree <- trees[[10]]
pol_bootstrap99 <- filter(poltree@data, UFboot > 99)$node
copy_pol_bootstrap99 <- pol_bootstrap99
# to cluster, for a given node with UFboot > 99, are its children also UFboot > 99?
# we can start at the beginning of the vector since those are the largest subtrees / closest to the root and delete the children from the list until we hit a leaf / there are no more children
i <- 1
node <- pol_bootstrap99[i]
tips <- 1:3860
clusters <- c()
while(!is.na(node)) {
subnodes <- offspring(poltree@phylo, node)
subnodes <- subnodes[-which(subnodes %in% tips)] # remove tips
#print("subnodes")
#print(subnodes)
# if all subnodes are in the set of nodes with bootstrap > 99, delete the subnodes and add node
# to set of clusters
if (length(subnodes)==0) {
clusters <- c(clusters, node)
} else if (all(subnodes %in% pol_bootstrap99)) {
copy_pol_bootstrap99 <- copy_pol_bootstrap99[-which(copy_pol_bootstrap99 %in% subnodes)]
#print("copy_pol_bootstrap99")
#print(copy_pol_bootstrap99)
clusters <- c(clusters, node)
}
i <- i + 1
node <- copy_pol_bootstrap99[i]
#print(node)
}
polclust_tips <- lapply(clusters, function(x) offspring(poltree@phylo, x, tiponly=TRUE))
polclust_labels <- lapply(polclust_tips, function(x) poltree@phylo$tip.label[x] %>% sort)
polclust_index <- sapply(polclust_labels, function(x) which(sapply(labels, identical, x)))
We also ID pol clusters using clusterpicker and a genetic distance of 0.045 along with a bootstrap value of 99. There are 139 clusters ID’d that way.
clusters_99_045_labels <- filter(polclust_99_045, ClusterNumber != -1) %>% group_by(ClusterNumber) %>% summarise(SequenceName = paste0(SequenceName, collapse=" ")) %>% .$SequenceName
clusters_99_045_labels <- lapply(clusters_99_045_labels, function(x) strsplit(x, " ")[[1]] %>% sort())
clusters_99_045_index <- sapply(clusters_99_045_labels, function(x) which(sapply(labels, identical, x)))
wgsclades <- filter(bootstrap_df, !is.na(wgs)) %>%
pivot_longer(cols=wgs90:pol, names_to="tree",values_to="bootstrap")
wgsclades$polclust_99 <- wgsclades$labelind %in% polclust_index
wgsclades$polclust_99_045 <- wgsclades$labelind %in% clusters_99_045_index
num_pol_in_wgs <- sum(polclust_index %in% wgsclades$labelind)
num_pol99_045_in_wgs <- sum(clusters_99_045_index %in% wgsclades$labelind)
cc <- scales::seq_gradient_pal("white", "blue", "Lab")(seq(0,1,length.out=10))
ggplot(wgsclades,
aes(x=wgs,y=bootstrap,color=tree), alpha=0.3) +
geom_jitter(data=filter(wgsclades,labelind %in% polclust_index),
aes(x=wgs,y=bootstrap),color="black",size=4.5,alpha=0.5) +
geom_jitter(data=filter(wgsclades,labelind %in% clusters_99_045_index),
aes(x=wgs,y=bootstrap),color="red",size=4.5,alpha=0.5) +
geom_jitter() +
scale_color_manual(values=cc, labels=c("pol", "wgs10-pol90", "wgs20-pol80",
"wgs30-pol70","wgs40-pol60","wgs50-pol50",
"wgs60-pol40","wgs70-pol30","wgs80-pol20",
"wgs90-pol10")) +
xlab("WGS Bootstrap") +
ylab("Mixture Bootstrap") +
ggtitle("WGS Bootstrap vs Mixture Bootstrap")
Ok, now that we have the clusters that are ID’d as transmission clusters in pol with a 99% bootstrap cutoff, we want to label them on the whole genome vs proportion plot. First we want to recreate that plot using ggplot though. Basically if a clade is in the whole alignment, plot its bootstrap value as well as any for the other trees.
There are 161 clusters in the pol tree, and 101 of them are in the wgs tree, which works out to 0.6273292 of them.
This plot is not so informative ??, unfortunately I think.
bootstrap_df_polclustonly <- filter(bootstrap_df, labelind %in% polclust_index) %>%
pivot_longer(wgs90:wgs, names_to="tree", values_to="bootstrap") %>%
mutate(bootbins=case_when(
bootstrap < 95 ~ "<95",
bootstrap == 95 ~ "95",
bootstrap == 96 ~ "96",
bootstrap == 97 ~ "97",
bootstrap == 98 ~ "98",
bootstrap == 99 ~ "99",
bootstrap == 100 ~ "100",
is.na(bootstrap) ~ 'NA'
))
ggplot(bootstrap_df_polclustonly, aes(x=factor(tree, level=c('pol','wgs10','wgs20','wgs30','wgs40','wgs50','wgs60','wgs70','wgs80','wgs90','wgs')),y=factor(bootbins,level=c('NA','<95','95','96','97','98','99','100')))) +
geom_bin_2d() + scale_fill_viridis_c() +
guides(fill=guide_legend(title="Number of nodes")) +
stat_bin_2d(geom="text",aes(label=..count..)) +
xlab("Tree") +
ylab("Bootstrap") +
ggtitle("Bootstrap values of pol clusters on different tree mixtures") +
labs(caption = "clusters not found on tree were counted as NA")
(#fig:heatmap_polclust_bootstrap)Bootstrap values of pol clusters on different trees. pol clusters that were not found on the tree were counted as NA. After wgs10, the wgs tree had the most clades shared with the pol clusters with a high bootstrap value.
bootstrap_df_clusterpickeronly <- filter(bootstrap_df, labelind %in% clusters_99_045_index) %>%
pivot_longer(wgs90:wgs, names_to="tree", values_to="bootstrap") %>%
mutate(bootbins=case_when(
bootstrap < 95 ~ "<95",
bootstrap == 95 ~ "95",
bootstrap == 96 ~ "96",
bootstrap == 97 ~ "97",
bootstrap == 98 ~ "98",
bootstrap == 99 ~ "99",
bootstrap == 100 ~ "100",
is.na(bootstrap) ~ 'NA'
))
ggplot(bootstrap_df_clusterpickeronly, aes(x=factor(tree, level=c('pol','wgs10','wgs20','wgs30','wgs40','wgs50','wgs60','wgs70','wgs80','wgs90','wgs')),y=factor(bootbins,level=c('NA','<95','95','96','97','98','99','100')))) +
geom_bin_2d() + scale_fill_viridis_c() +
guides(fill=guide_legend(title="Number of nodes")) +
stat_bin_2d(geom="text",aes(label=..count..)) +
xlab("Tree") +
ylab("Bootstrap") +
ggtitle("Bootstrap values of pol clusters with 0.045 genetic distance and 99 bootstrap on different tree mixtures") +
labs(caption = "clusters not found on tree were counted as NA")
Figure 3.1: Bootstrap values of pol clusters from clusterpicker with 0.045 genetic distance on different trees. pol clusters that were not found on the tree were counted as NA. After wgs10, the wgs tree had the most clades shared with the pol clusters with a high bootstrap value.
The lineplot (Figure~3.2) is the same data as the heatmap, but I only plotted as a line the clades on the tree that are >99, and then those that are NA.
lineplot_df_clusterpicker <- bootstrap_df_clusterpickeronly %>% filter(bootstrap >= 99 | is.na(bootstrap)) %>% transmute(tree=tree, bootbins2=case_when(bootstrap >= 99 ~ ">99")) %>% count(tree, bootbins2)
ggplot(lineplot_df_clusterpicker, aes(x=factor(tree, level=c('pol','wgs10','wgs20','wgs30','wgs40','wgs50','wgs60','wgs70','wgs80','wgs90','wgs')), y=n)) + geom_line(aes(color=bootbins2, group=bootbins2)) +
xlab("Tree") +
ylab("Count") +
guides(color=guide_legend(title="Bootstrap value of node")) +
ggtitle("Number of clades from pol tree with given bootstrap value (>99 or NA) on each tree")
Figure 3.2: Number of clusters from clusterpicker with 0.045 and 99% bootstrap on each tree
So I think there is definitely low congruence between the pol tree and the wgs tree. Some other notes:
Possible explanations:
One idea we had was to plot the genetic distance of the pol clades. We compute the genetic distance using the TN93 model, and look at the maximum genetic distance in a given cluster (Chosen based on clusterpicker paper) and then plot the genetic distance distributions for pol for those with:
bootstrap_ge95_wgs <- labels[filter(bootstrap_df_clusterpickeronly, tree=="wgs" & bootstrap>=95)$labelind]
bootstrap_l95_wgs <- labels[filter(bootstrap_df_clusterpickeronly, tree=="wgs" & bootstrap<95)$labelind]
bootstrap_na_wgs <- labels[filter(bootstrap_df_clusterpickeronly, is.na(bootstrap) & tree=="wgs")$labelind]
# Given list of cluster labels and ape.dist object, compute maximum genetic distance
# for given cluster
max_gendist <- function(clusters, dist) {
sapply(clusters, function(x) max_gendist_helper(x, dist))
}
max_gendist_helper <- function(cluster, dist) {
i <- which(rownames(dist) %in% cluster) # find rows with cluster label
combos <- combn(i, 2) # make all combos of 2
d <- dist[t(combos)] # get distances for all combos (goes by 2)
return(max(d))
}
dists_ge95_wgs <- max_gendist(bootstrap_ge95_wgs, adists$wgs)
dists_l95_wgs <- max_gendist(bootstrap_l95_wgs, adists$wgs)
dists_na_wgs <- max_gendist(bootstrap_na_wgs, adists$wgs)
dist_df <- data.frame(cat=c(rep(">=95 bootstrap", length(dists_ge95_wgs)),
rep("<95 bootstrap", length(dists_l95_wgs)),
rep("NA", length(dists_na_wgs))),
dist=c(dists_ge95_wgs,dists_l95_wgs,dists_na_wgs))
#ggplot(dist_df, aes(x=dist)) + geom_freqpoly(aes(color=cat))
# contains for wgs as well, a bit redundant but it's fine
dist_mask_df_list <- lapply(treenames[c(1:9,11)], function(x) {
bge95 <- labels[filter(bootstrap_df_clusterpickeronly, tree==x & bootstrap>=95)$labelind]
bl95 <- labels[filter(bootstrap_df_clusterpickeronly, tree==x & bootstrap<95)$labelind]
bna <- labels[filter(bootstrap_df_clusterpickeronly, is.na(bootstrap) & tree==x)$labelind]
dge95 <- max_gendist(bge95, adists[[x]])
dl95 <- max_gendist(bl95, adists[[x]])
dna <- max_gendist(bna, adists[[x]])
return(data.frame(cat=c(rep(">=95 bootstrap", length(dge95)),
rep("<95 bootstrap", length(dl95)),
rep("NA", length(dna))),
dist=c(dge95,dl95,dna)))
}
)
names(dist_mask_df_list) <- treenames[c(1:9,11)]
dist_df <- bind_rows(dist_mask_df_list, .id="tree")
ggplot(dist_df, aes(x=dist)) + geom_freqpoly(aes(color=cat)) + facet_wrap(vars(tree)) +
geom_vline(xintercept=0.045, linetype=3) +
ggtitle("Frequency of genetic distance for pol clusters on a given tree")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Figure 3.3: Frequency of genetic distance on each tree of pol clusters with genetic distance 0.045 and bootstrap 99. Clusters were divided into three categories: green is those with >=95 bootstrap on the given tree, red is those with <95 bootstrap, and blue is those not found in the given tree.
Figure 3.3 shows these genetic distance distributions. It seems interesting that most of the clusters not found on the whole genome tree actually also have greater genetic distance, while for the other trees genetic distance for clusters with >=95 bootstrap and clusters not present is pretty much the same.
# redo for bootstrap >=99
dist_mask_df_list <- lapply(treenames[c(1:9,11)], function(x) {
bge95 <- labels[filter(bootstrap_df_clusterpickeronly, tree==x & bootstrap>=99)$labelind]
bl95 <- labels[filter(bootstrap_df_clusterpickeronly, tree==x & bootstrap<99)$labelind]
bna <- labels[filter(bootstrap_df_clusterpickeronly, is.na(bootstrap) & tree==x)$labelind]
dge95 <- max_gendist(bge95, adists[[x]])
dl95 <- max_gendist(bl95, adists[[x]])
dna <- max_gendist(bna, adists[[x]])
return(data.frame(cat=c(rep(">=95 bootstrap", length(dge95)),
rep("<95 bootstrap", length(dl95)),
rep("NA", length(dna))),
dist=c(dge95,dl95,dna)))
}
)
names(dist_mask_df_list) <- treenames[c(1:9,11)]
dist_df_99 <- bind_rows(dist_mask_df_list, .id="tree")
ggplot(dist_df_99, aes(x=dist)) + geom_freqpoly(aes(color=cat)) + facet_wrap(vars(tree))
todo: cluster all of them by bootstrap value and then do an upset plot comparing them
# spotchecking the results of this it does look ok but should still write a test tree and be careful...
poltree <- trees[[10]]
pol_bootstrap99 <- filter(poltree@data, UFboot > 99)$node
copy_pol_bootstrap99 <- pol_bootstrap99
# to cluster, for a given node with UFboot > 99, are its children also UFboot > 99?
# we can start at the beginning of the vector since those are the largest subtrees / closest to the root and delete the children from the list until we hit a leaf / there are no more children
i <- 1
node <- pol_bootstrap99[i]
tips <- 1:3860
clusters <- c()
while(!is.na(node)) {
subnodes <- offspring(poltree@phylo, node)
subnodes <- subnodes[-which(subnodes %in% tips)] # remove tips
#print("subnodes")
#print(subnodes)
# if all subnodes are in the set of nodes with bootstrap > 99, delete the subnodes and add node
# to set of clusters
if (length(subnodes)==0) {
clusters <- c(clusters, node)
} else if (all(subnodes %in% pol_bootstrap99)) {
copy_pol_bootstrap99 <- copy_pol_bootstrap99[-which(copy_pol_bootstrap99 %in% subnodes)]
#print("copy_pol_bootstrap99")
#print(copy_pol_bootstrap99)
clusters <- c(clusters, node)
}
i <- i + 1
node <- copy_pol_bootstrap99[i]
#print(node)
}
polclust_tips <- lapply(clusters, function(x) offspring(poltree@phylo, x, tiponly=TRUE))
polclust_labels <- lapply(polclust_tips, function(x) poltree@phylo$tip.label[x] %>% sort)
polclust_index <- sapply(polclust_labels, function(x) which(sapply(labels, identical, x)))
One idea was to look into the branch length distribution for the nodes that were in each quadrant to see if there was a relationship there. To do this we first split them into the different quadrants.
zones <- mutate(wgsclades, zone = case_when(
wgs < 95 & bootstrap < 95 ~ "danger",
wgs > 95 & bootstrap < 95 ~ "wgshigh",
wgs > 95 & bootstrap > 95 ~ "allhigh",
wgs < 95 & bootstrap < 95 ~ "alllow"
)) %>% filter(!is.na(zone))
# we have a sad, because future_lapply doesn't work on offspring, same issue with the no applicable
# method thing. Stop here for today.
nodes_to_labels <- function(phylo, labels) {
sapply(3862:(3858+3860), function(z) {
print(z)
tipind <- tidytree::offspring(phylo, z, tiponly=TRUE) # get indices of tips
print(tipind)
labind <- phylo$tip.label[tipind] %>% sort %>% # get labels of tips
sapply(labels, identical, .) %>%
which # get index of clade in labels vector
})
}
multiphylos <- c(rtree(10),rtree(10))
ggplot(as_tibble(phylos[[3]]), aes(x=branch.length)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
names(trees) <- c(90,80,70,60,50,40,30,20,10,0,100)
heatmap_df <- lapply(trees, function(x) x@data) %>% bind_rows(.id="id")
heatmap_df$id <- factor(heatmap_df$id, order=TRUE, levels=c(0,10,20,30,40,50,60,70,80,90,100))
heatmap_df <- mutate(heatmap_df, bootbins=case_when(
UFboot < 95 ~ "<95",
UFboot == 95 ~ "95",
UFboot == 96 ~ "96",
UFboot == 97 ~ "97",
UFboot == 98 ~ "98",
UFboot == 99 ~ "99",
UFboot == 100 ~ "100"
))
p <- ggplot(heatmap_df, aes(x=id,y=UFboot)) + geom_bin_2d() +
xlab("Percentage of whole genome") +
ylab("Bootstrap values") +
ggtitle("Number of nodes with y bootstrap value in tree \n
with x percent of tips as whole genome sequences") +
guides(fill=guide_legend(title="Number of nodes"))
ggplotly(p)
## Warning: Removed 11 rows containing non-finite values (stat_bin2d).